package cn.xo68.boot.webgather.job;

import cn.xo68.boot.job.JobProvider;
import cn.xo68.boot.job.entity.QuartzJob;
import cn.xo68.boot.webgather.common.GatherStatusEnums;
import cn.xo68.boot.webgather.document.WebGatherLinkDoc;
import cn.xo68.boot.webgather.entity.GatherListPageConfig;
import cn.xo68.boot.webgather.resolve.ContentResolve;
import cn.xo68.boot.webgather.resolve.ResolveFactory;
import cn.xo68.boot.webgather.service.WebGatherLinkService;
import cn.xo68.core.date.DateTime;
import cn.xo68.core.util.JsonUtil;
import cn.xo68.core.util.StringTools;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.quartz.JobExecutionContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
import org.springframework.stereotype.Component;

import java.net.URL;

/**
 * Job provider that gathers article links from list pages.
 * <p>
 * The job parameters are parsed as JSON into a {@link GatherListPageConfig}. Example parameters:
 * <pre>
 * { "resolveType":"OK_HTTP","url": "https://blog.csdn.net/nnsword/article/list/${pageIndex}", "hasPage": true, "minPageIndex": 1, "maxPageIndex": 3, "listQuery": ".article-list div h4 a", "contentTitle": "H1.title-article", "contentBody": "#article_content div.markdown_views" }
 * </pre>
 *
 * @author wuxie
 * @date 2018-7-19
 */
@ConditionalOnMissingBean(name = "webListPageGatherJobProvider")
@Component("webListPageGatherJobProvider")
public class WebListPageGatherJobProvider implements JobProvider {

    private static final Logger logger = LoggerFactory.getLogger(WebListPageGatherJobProvider.class);

    /** Placeholder in the configured URL that is replaced with the current page index. */
    private static final String PAGE_INDEX_PLACEHOLDER = "${pageIndex}";

    /** Pause between two consecutive list-page requests, in milliseconds. */
    private static final long PAGE_INTERVAL_MILLIS = 200L;

    // Optional dependency: may be null when no such bean is defined (required = false).
    @Autowired(required = false)
    private ResolveFactory resolveFactory;
    @Autowired
    private JsonUtil jsonUtil;
    // Optional dependency: may be null when no such bean is defined (required = false).
    @Autowired(required = false)
    private WebGatherLinkService webGatherLinkService;

    /**
     * Parses the job parameters and gathers the configured list page, or every page from
     * {@code minPageIndex} to {@code maxPageIndex} (inclusive) when paging is enabled.
     * <p>
     * Failures are logged rather than propagated. In paged mode the first failing page stops
     * the run, and so does an interruption of the pause between pages.
     *
     * @param context   the Quartz execution context (not used by this provider)
     * @param quartzJob the job whose parameters hold the JSON list-page configuration
     */
    @Override
    public void execute(JobExecutionContext context, QuartzJob quartzJob) {
        GatherListPageConfig gatherListPageConfig = jsonUtil.parse(quartzJob.getParameters(), GatherListPageConfig.class);
        logger.debug("gatherListPageConfig:  {}", gatherListPageConfig);
        if (gatherListPageConfig == null) {
            logger.error("参数不正确或为空");
            return;
        }

        // Both collaborators are optional beans; fail with a clear message instead of an NPE.
        if (resolveFactory == null || webGatherLinkService == null) {
            logger.error("ResolveFactory 或 WebGatherLinkService 未注入，无法执行采集");
            return;
        }

        String configuredUrl = gatherListPageConfig.getUrl();
        if (configuredUrl == null || configuredUrl.trim().isEmpty()) {
            logger.error("参数不正确或为空");
            return;
        }

        if (!gatherListPageConfig.isHasPage()) {
            tryGatherListPage(quartzJob, gatherListPageConfig, configuredUrl);
            return;
        }

        for (int pageIndex = gatherListPageConfig.getMinPageIndex(); pageIndex <= gatherListPageConfig.getMaxPageIndex(); pageIndex++) {
            String urlStr = configuredUrl.replace(PAGE_INDEX_PLACEHOLDER, String.valueOf(pageIndex));
            logger.info("列表地址：{}", urlStr);
            if (!tryGatherListPage(quartzJob, gatherListPageConfig, urlStr)) {
                // Keep the existing policy: one failed page aborts the remaining pages.
                return;
            }

            try {
                Thread.sleep(PAGE_INTERVAL_MILLIS);
            } catch (InterruptedException e) {
                logger.error("采集列表时休眠发生异常", e);
                // Restore the interrupt flag so the caller can observe it, and stop gathering.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Gathers one list page and logs any failure instead of propagating it.
     *
     * @param quartzJob            the job the gathered links belong to
     * @param gatherListPageConfig the list-page configuration
     * @param listUrlStr           the URL of the list page
     * @return {@code true} when the page was gathered without an exception, {@code false} otherwise
     */
    private boolean tryGatherListPage(QuartzJob quartzJob, GatherListPageConfig gatherListPageConfig, String listUrlStr) {
        try {
            gatherListPage(quartzJob, gatherListPageConfig, listUrlStr);
            return true;
        } catch (Throwable e) {
            // Throwable is caught because gatherListPage declares it.
            logger.error("采集[{}]异常", listUrlStr, e);
            return false;
        }
    }

    /**
     * Resolves one list page, selects the link elements with the configured list query and
     * inserts a waiting-to-gather link document for every link the link service does not
     * report as existing.
     *
     * @param quartzJob            the job the gathered links belong to
     * @param gatherListPageConfig the list-page configuration (resolve type and list query)
     * @param listUrlStr           the URL of the list page
     * @throws Throwable whatever the resolver or the link service throws
     */
    private void gatherListPage(QuartzJob quartzJob, GatherListPageConfig gatherListPageConfig, String listUrlStr) throws Throwable {
        ContentResolve contentResolve = resolveFactory.getContentResolve(gatherListPageConfig.getResolveType(), listUrlStr);

        Elements eles = contentResolve.listElements(gatherListPageConfig.getListQuery());
        if (eles == null) {
            return;
        }

        for (Element ele : eles) {
            String href = ele.attr("href");
            logger.info("链接文本：{}，链接地址：{}", ele.text(), href);

            String urlStr = href == null ? "" : href.trim();
            if (urlStr.isEmpty()) {
                // An anchor without a usable href cannot be gathered later; do not store it.
                continue;
            }

            if (webGatherLinkService.exist(urlStr)) {
                continue;
            }

            WebGatherLinkDoc doc = new WebGatherLinkDoc();
            doc.setJobId(quartzJob.getJobId());
            doc.setGatherStatus(GatherStatusEnums.WAITGATHING);
            doc.setGatherListPageConfig(gatherListPageConfig);
            doc.setLinkTitle(ele.text());
            doc.setLinkUrl(urlStr);
            doc.setLinkGatherTime(DateTime.Now().getDate());
            webGatherLinkService.insertLink(doc);
        }
    }

}
